Prg 1 - Develop a program to create histograms for all numerical features and analyze the distribution of each feature. Generate box plots for all numerical features and identify any outliers. Use California Housing dataset.¶

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
In [45]:
df = pd.read_csv(r"./housing.csv")
In [46]:
df.head()
Out[46]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [47]:
df.shape
Out[47]:
(20640, 10)
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [49]:
df.isnull().sum()
Out[49]:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
In [50]:
df.duplicated().sum()
Out[50]:
0
In [51]:
df['total_bedrooms'].median()
Out[51]:
435.0
In [52]:
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
In [53]:
for i in df.iloc[:,2:7]:
    df[i] = df[i].astype('int')
In [54]:
df.head()
Out[54]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41 880 129 322 126 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21 7099 1106 2401 1138 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52 1467 190 496 177 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52 1274 235 558 219 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52 1627 280 565 259 3.8462 342200.0 NEAR BAY
In [55]:
df.describe().T
Out[55]:
count mean std min 25% 50% 75% max
longitude 20640.0 -119.569704 2.003532 -124.3500 -121.8000 -118.4900 -118.01000 -114.3100
latitude 20640.0 35.631861 2.135952 32.5400 33.9300 34.2600 37.71000 41.9500
housing_median_age 20640.0 28.639486 12.585558 1.0000 18.0000 29.0000 37.00000 52.0000
total_rooms 20640.0 2635.763081 2181.615252 2.0000 1447.7500 2127.0000 3148.00000 39320.0000
total_bedrooms 20640.0 536.838857 419.391878 1.0000 297.0000 435.0000 643.25000 6445.0000
population 20640.0 1425.476744 1132.462122 3.0000 787.0000 1166.0000 1725.00000 35682.0000
households 20640.0 499.539680 382.329753 1.0000 280.0000 409.0000 605.00000 6082.0000
median_income 20640.0 3.870671 1.899822 0.4999 2.5634 3.5348 4.74325 15.0001
median_house_value 20640.0 206855.816909 115395.615874 14999.0000 119600.0000 179700.0000 264725.00000 500001.0000
In [56]:
Numericals = df.select_dtypes(include=[np.number]).columns
print(Numericals)
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')
In [21]:
for col in Numericals:
    plt.figure(figsize=(10,6))
    df[col].plot(kind="hist", title=col, bins=60, edgecolor="black")
    plt.ylabel("Frequency")
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [57]:
for col in Numericals:
    # print(df[col])
    plt.figure(figsize=(6,6))
    sns.boxplot(data=df[col], color="blue")
    plt.title(col)
    plt.ylabel(col)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Prg 2 - Develop a program to Compute the correlation matrix to understand the relationships between pairs of features. Visualize the correlation matrix using a heatmap to know which variables have strong positive/negative correlations. Create a pair plot to visualize pairwise relationships between features. Use California Housing dataset.¶

In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

import warnings
warnings.filterwarnings("ignore")
In [326]:
data = fetch_california_housing()
In [327]:
df = pd.DataFrame(data.data, columns=data.feature_names)
In [328]:
df['Target'] = data.target
In [329]:
df.head()
Out[329]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude Target
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422
In [330]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
In [331]:
df.describe()
Out[331]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude Target
count 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean 3.870671 28.639486 5.429000 1.096675 1425.476744 3.070655 35.631861 -119.569704 2.068558
std 1.899822 12.585558 2.474173 0.473911 1132.462122 10.386050 2.135952 2.003532 1.153956
min 0.499900 1.000000 0.846154 0.333333 3.000000 0.692308 32.540000 -124.350000 0.149990
25% 2.563400 18.000000 4.440716 1.006079 787.000000 2.429741 33.930000 -121.800000 1.196000
50% 3.534800 29.000000 5.229129 1.048780 1166.000000 2.818116 34.260000 -118.490000 1.797000
75% 4.743250 37.000000 6.052381 1.099526 1725.000000 3.282261 37.710000 -118.010000 2.647250
max 15.000100 52.000000 141.909091 34.066667 35682.000000 1243.333333 41.950000 -114.310000 5.000010
In [332]:
df.isnull().sum()
Out[332]:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64
In [333]:
df.duplicated().sum()
Out[333]:
0
In [334]:
plt.figure(figsize=(12,8))
df.hist(figsize=(12,8), bins=30, edgecolor="black")
plt.suptitle("Feature Distribution", fontsize=16)
plt.show()
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
In [336]:
plt.figure(figsize=(12,6))
sns.boxplot(data=df)
plt.xticks(rotation=45)
plt.suptitle("Boxplots of Features to Identify Outliers", fontsize=16)
plt.show()
No description has been provided for this image
In [339]:
plt.figure(figsize=(10,6))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.suptitle("Feature Correlation Heatmap", fontsize=16)
plt.show()
No description has been provided for this image
In [81]:
sns.pairplot(data=df[["MedInc", "HouseAge", "AveRooms", "Target"]], diag_kind="kde")
plt.show()
No description has been provided for this image

Prg 3 - Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality of theIris dataset from 4 features to 2.¶

In [340]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
In [341]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
In [342]:
iris = datasets.load_iris()
In [343]:
X = iris.data
y = iris.target
In [344]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [345]:
cov_matrix = np.cov(X_scaled.T)
In [346]:
cov_matrix
Out[346]:
array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])
In [347]:
evalues, evectors = np.linalg.eig(cov_matrix)
In [348]:
evalues
Out[348]:
array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])
In [355]:
evectors
Out[355]:
array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])
In [350]:
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
colors = ["red", "green", "blue"]
labels = iris.target_names
for i in range(len(colors)):
    ax.scatter(X_scaled[y == i, 0], X_scaled[y == i, 1], X_scaled[y == i, 2], color=colors[i], label=labels[i])
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
ax.set_zlabel("Petal Length")
ax.set_title("3D Visualization of Iris Data Before PCA")
plt.legend()
plt.show()
No description has been provided for this image
In [130]:
U, S, Vt = np.linalg.svd(X_scaled, full_matrices=False)
In [131]:
S
Out[131]:
array([20.92306556, 11.7091661 ,  4.69185798,  1.76273239])
In [132]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
In [133]:
explained_var = pca.explained_variance_ratio_
In [134]:
print(f"Explained Variance by PC1: {explained_var[0]:.2f}")
print(f"Explained Variance by PC2: {explained_var[1]:.2f}")
Explained Variance by PC1: 0.73
Explained Variance by PC2: 0.23
In [135]:
plt.figure(figsize=(8, 6))
for i in range(len(colors)):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], color=colors[i], label=labels[i])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA on Iris Dataset (Dimensionality Reduction)')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image
In [136]:
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
for i in range(len(colors)):
    ax.scatter(X_scaled[y == i, 0], X_scaled[y == i, 1], X_scaled[y == i, 2], color=colors[i], label=labels[i])
for i in range(3): # Plot first three eigenvectors
    ax.quiver(0, 0, 0, evectors[i, 0], evectors[i, 1], evectors[i, 2], color='black', length=1)
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Sepal Width')
ax.set_zlabel('Petal Length')
ax.set_title('3D Data with Eigenvectors')
plt.legend()
plt.show()
No description has been provided for this image

Prg 4 - For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S algorithm to output a description of the set of all hypotheses consistent with the training examples.¶

In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
In [138]:
data = pd.read_csv(r"./training_data2.csv")
In [139]:
data
Out[139]:
Experience Qualification Skill Age Hired
0 Yes Masters Python 30 Yes
1 Yes Bachelors Python 25 Yes
2 No Bachelors Java 28 No
3 Yes Masters Java 40 Yes
4 No Masters Python 35 No
In [140]:
def find_s_alg(data):
    attr = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values
    for i in range(len(target)):
        if target[i] == "Yes":
            hypo = attr[i].copy()
            break
    for i in range(len(target)):
        if target[i] == "Yes":
            for j in range(len(hypo)):
                if hypo[j] != attr[i][j]:
                    hypo[j] = "?"
    return hypo
In [142]:
final_hypo = find_s_alg(data)
final_hypo
Out[142]:
array(['Yes', '?', '?', '?'], dtype=object)

Prg 5 - Develop a program to implement k-Nearest Neighbour algorithm to classify the randomly generated 100 values of x in the range of [0,1]. Perform the following based on dataset generated.¶

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
In [144]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
In [146]:
np.random.seed(42)
values = np.random.rand(100)
In [147]:
labels = []
for i in values[:50]:
    if i <=0.5:
        labels.append('Class1')
    else:
        labels.append('Class2')
In [148]:
labels += [None] * 50
In [150]:
print(labels)
['Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1', 'Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
In [151]:
data = {
    "Point": [f"x{i+1}" for i in range(100)],
    "Value": values,
    "Label": labels
}
In [152]:
df = pd.DataFrame(data)
In [153]:
df.head()
Out[153]:
Point Value Label
0 x1 0.374540 Class1
1 x2 0.950714 Class2
2 x3 0.731994 Class2
3 x4 0.598658 Class2
4 x5 0.156019 Class1
In [154]:
df.nunique()
Out[154]:
Point    100
Value    100
Label      2
dtype: int64
In [155]:
df.shape
Out[155]:
(100, 3)
In [156]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Point   100 non-null    object 
 1   Value   100 non-null    float64
 2   Label   50 non-null     object 
dtypes: float64(1), object(2)
memory usage: 2.5+ KB
In [158]:
df.describe().T
Out[158]:
count mean std min 25% 50% 75% max
Value 100.0 0.470181 0.297489 0.005522 0.193201 0.464142 0.730203 0.986887
In [159]:
df.isnull().sum()
Out[159]:
Point     0
Value     0
Label    50
dtype: int64
In [162]:
num_col = df.select_dtypes(include=['int', 'float']).columns
df[num_col].hist(figsize=(12, 8), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()
No description has been provided for this image
In [176]:
labelled_df = df[df["Label"].notna()]
X_train = labelled_df[["Value"]]
y_train = labelled_df["Label"]
In [185]:
unlabelled_df = df[df["Label"].isna()]
X_test = unlabelled_df[["Value"]]
In [186]:
true_labels = ["Class1" if x <= 0.5 else "Class2" for x in values[50:]]
In [187]:
k_values = [1, 2, 3, 4, 5, 20, 30]
results = {}
accuracies = {}
In [188]:
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    results[k] = pred
    acc = accuracy_score(true_labels, pred) * 100
    accuracies[k] = acc
    print(f"Accuracy for k={k}: {acc:.2f}%")
    # Assign predictions back to the DataFrame for this k
    unlabelled_df[f"Label_k{k}"] = pred
Accuracy for k=1: 100.00%
Accuracy for k=2: 100.00%
Accuracy for k=3: 98.00%
Accuracy for k=4: 98.00%
Accuracy for k=5: 98.00%
Accuracy for k=20: 98.00%
Accuracy for k=30: 100.00%
In [189]:
print(results)
{1: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 2: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 3: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 4: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 5: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 20: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object), 30: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
       'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
       'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
       'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
       'Class1', 'Class1'], dtype=object)}
In [194]:
df1 = unlabelled_df.drop(columns=["Label"], axis=1)
df1
Out[194]:
Point Value Label_k1 Label_k2 Label_k3 Label_k4 Label_k5 Label_k20 Label_k30
50 x51 0.969585 Class2 Class2 Class2 Class2 Class2 Class2 Class2
51 x52 0.775133 Class2 Class2 Class2 Class2 Class2 Class2 Class2
52 x53 0.939499 Class2 Class2 Class2 Class2 Class2 Class2 Class2
53 x54 0.894827 Class2 Class2 Class2 Class2 Class2 Class2 Class2
54 x55 0.597900 Class2 Class2 Class2 Class2 Class2 Class2 Class2
55 x56 0.921874 Class2 Class2 Class2 Class2 Class2 Class2 Class2
56 x57 0.088493 Class1 Class1 Class1 Class1 Class1 Class1 Class1
57 x58 0.195983 Class1 Class1 Class1 Class1 Class1 Class1 Class1
58 x59 0.045227 Class1 Class1 Class1 Class1 Class1 Class1 Class1
59 x60 0.325330 Class1 Class1 Class1 Class1 Class1 Class1 Class1
60 x61 0.388677 Class1 Class1 Class1 Class1 Class1 Class1 Class1
61 x62 0.271349 Class1 Class1 Class1 Class1 Class1 Class1 Class1
62 x63 0.828738 Class2 Class2 Class2 Class2 Class2 Class2 Class2
63 x64 0.356753 Class1 Class1 Class1 Class1 Class1 Class1 Class1
64 x65 0.280935 Class1 Class1 Class1 Class1 Class1 Class1 Class1
65 x66 0.542696 Class2 Class2 Class2 Class2 Class2 Class2 Class2
66 x67 0.140924 Class1 Class1 Class1 Class1 Class1 Class1 Class1
67 x68 0.802197 Class2 Class2 Class2 Class2 Class2 Class2 Class2
68 x69 0.074551 Class1 Class1 Class1 Class1 Class1 Class1 Class1
69 x70 0.986887 Class2 Class2 Class2 Class2 Class2 Class2 Class2
70 x71 0.772245 Class2 Class2 Class2 Class2 Class2 Class2 Class2
71 x72 0.198716 Class1 Class1 Class1 Class1 Class1 Class1 Class1
72 x73 0.005522 Class1 Class1 Class1 Class1 Class1 Class1 Class1
73 x74 0.815461 Class2 Class2 Class2 Class2 Class2 Class2 Class2
74 x75 0.706857 Class2 Class2 Class2 Class2 Class2 Class2 Class2
75 x76 0.729007 Class2 Class2 Class2 Class2 Class2 Class2 Class2
76 x77 0.771270 Class2 Class2 Class2 Class2 Class2 Class2 Class2
77 x78 0.074045 Class1 Class1 Class1 Class1 Class1 Class1 Class1
78 x79 0.358466 Class1 Class1 Class1 Class1 Class1 Class1 Class1
79 x80 0.115869 Class1 Class1 Class1 Class1 Class1 Class1 Class1
80 x81 0.863103 Class2 Class2 Class2 Class2 Class2 Class2 Class2
81 x82 0.623298 Class2 Class2 Class2 Class2 Class2 Class2 Class2
82 x83 0.330898 Class1 Class1 Class1 Class1 Class1 Class1 Class1
83 x84 0.063558 Class1 Class1 Class1 Class1 Class1 Class1 Class1
84 x85 0.310982 Class1 Class1 Class1 Class1 Class1 Class1 Class1
85 x86 0.325183 Class1 Class1 Class1 Class1 Class1 Class1 Class1
86 x87 0.729606 Class2 Class2 Class2 Class2 Class2 Class2 Class2
87 x88 0.637557 Class2 Class2 Class2 Class2 Class2 Class2 Class2
88 x89 0.887213 Class2 Class2 Class2 Class2 Class2 Class2 Class2
89 x90 0.472215 Class1 Class1 Class1 Class1 Class1 Class1 Class1
90 x91 0.119594 Class1 Class1 Class1 Class1 Class1 Class1 Class1
91 x92 0.713245 Class2 Class2 Class2 Class2 Class2 Class2 Class2
92 x93 0.760785 Class2 Class2 Class2 Class2 Class2 Class2 Class2
93 x94 0.561277 Class2 Class2 Class2 Class2 Class2 Class2 Class2
94 x95 0.770967 Class2 Class2 Class2 Class2 Class2 Class2 Class2
95 x96 0.493796 Class1 Class1 Class2 Class2 Class2 Class2 Class1
96 x97 0.522733 Class2 Class2 Class2 Class2 Class2 Class2 Class2
97 x98 0.427541 Class1 Class1 Class1 Class1 Class1 Class1 Class1
98 x99 0.025419 Class1 Class1 Class1 Class1 Class1 Class1 Class1
99 x100 0.107891 Class1 Class1 Class1 Class1 Class1 Class1 Class1

Prg 6 - Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select appropriate data set for your experiment and draw graphs¶

In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
In [224]:
def g_kernel(x, x_query, tau):
    return np.exp(- (x - x_query) ** 2 / (2 * tau ** 2))
In [225]:
def lwr(X, y, x_query, tau):
    X_b = np.c_[np.ones(len(X)), X]
    x_query_b = np.array([1, x_query])
    W = np.diag(g_kernel(X, x_query, tau))
    theta = np.linalg.inv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta
In [226]:
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 1.3, 3.75, 2.25])
In [227]:
x_query = 3
tau = 1
In [228]:
y_pred = lwr(X, y, x_query, tau)
In [229]:
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.scatter(x_query, y_pred, color='red', label=f'Prediction at x={x_query}')
weights = g_kernel(X, x_query, tau)
for i in range(len(X)):
    plt.plot([X[i], X[i]], [y[i], y[i] - weights[i]], 'k-', lw=1)
    plt.scatter(X[i], y[i], s=weights[i] * 200, color='green', alpha=0.5)
plt.title("Locally Weighted Regression (LWR)")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
No description has been provided for this image
In [212]:
from sklearn.linear_model import LinearRegression
In [233]:
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau = 1.0 # Bandwidth parameter
# Compute LWR predictions
y_lwr = np.array([lwr(X, y, x_q, tau) for x_q in X_query])
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
plt.plot(X_query, y_lwr, color='red', label='Locally Weighted Regression')
plt.title("Comparison: Simple Linear Regression vs. Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
No description has been provided for this image
In [238]:
# Complex Dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau_values = [0.1, 0.5, 1.0, 5.0, 10.0] # Different bandwidth values
def lwr(X, y, x_query, tau):
    X_b = np.c_[np.ones(len(X)), X] # Add bias term (Intercept)
    x_query_b = np.array([1, x_query]) # Query point with bias term
    W = np.diag(gaussian_kernel(X, x_query, tau)) # Compute weights
    # Compute theta using pseudo-inverse to avoid singular matrix error
    theta = np.linalg.pinv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta # Return prediction
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(12, 8))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
# Plot LWR for different tau values
colors = ['red', 'green', 'purple', 'orange', 'brown']
for tau, color in zip(tau_values, colors):
    y_lwr = np.array([lwr(X, y, x_q, tau) for x_q in X_query])
    plt.plot(X_query, y_lwr, color=color, label=f'LWR (τ={tau})')
plt.title("Effect of Different τ Values in Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
No description has been provided for this image

Prg 8 - Develop a program to demonstrate the working of the decision tree algorithm. Use Breast Cancer Data set for building the decision tree and apply this knowledge to classify a new sample.¶

In [239]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
In [240]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
In [241]:
data = pd.read_csv(r'./WisconsinBreastCancerdataset.csv')
In [242]:
pd.set_option('display.max_columns', None)
In [243]:
data.head()
Out[243]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN
In [244]:
data.shape
Out[244]:
(569, 33)
In [245]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [246]:
data.diagnosis.unique()
Out[246]:
array(['M', 'B'], dtype=object)
In [247]:
data.duplicated().sum()
Out[247]:
0
In [248]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)
In [251]:
df['diagnosis'] = df['diagnosis'].map({"M": 1, "B": 0}) # Malignant:1, Benign:0
In [253]:
df.describe().T
Out[253]:
count mean std min 25% 50% 75% max
diagnosis 569.0 0.372583 0.483918 0.000000 0.000000 0.000000 1.000000 1.00000
radius_mean 569.0 14.127292 3.524049 6.981000 11.700000 13.370000 15.780000 28.11000
texture_mean 569.0 19.289649 4.301036 9.710000 16.170000 18.840000 21.800000 39.28000
perimeter_mean 569.0 91.969033 24.298981 43.790000 75.170000 86.240000 104.100000 188.50000
area_mean 569.0 654.889104 351.914129 143.500000 420.300000 551.100000 782.700000 2501.00000
smoothness_mean 569.0 0.096360 0.014064 0.052630 0.086370 0.095870 0.105300 0.16340
compactness_mean 569.0 0.104341 0.052813 0.019380 0.064920 0.092630 0.130400 0.34540
concavity_mean 569.0 0.088799 0.079720 0.000000 0.029560 0.061540 0.130700 0.42680
concave points_mean 569.0 0.048919 0.038803 0.000000 0.020310 0.033500 0.074000 0.20120
symmetry_mean 569.0 0.181162 0.027414 0.106000 0.161900 0.179200 0.195700 0.30400
fractal_dimension_mean 569.0 0.062798 0.007060 0.049960 0.057700 0.061540 0.066120 0.09744
radius_se 569.0 0.405172 0.277313 0.111500 0.232400 0.324200 0.478900 2.87300
texture_se 569.0 1.216853 0.551648 0.360200 0.833900 1.108000 1.474000 4.88500
perimeter_se 569.0 2.866059 2.021855 0.757000 1.606000 2.287000 3.357000 21.98000
area_se 569.0 40.337079 45.491006 6.802000 17.850000 24.530000 45.190000 542.20000
smoothness_se 569.0 0.007041 0.003003 0.001713 0.005169 0.006380 0.008146 0.03113
compactness_se 569.0 0.025478 0.017908 0.002252 0.013080 0.020450 0.032450 0.13540
concavity_se 569.0 0.031894 0.030186 0.000000 0.015090 0.025890 0.042050 0.39600
concave points_se 569.0 0.011796 0.006170 0.000000 0.007638 0.010930 0.014710 0.05279
symmetry_se 569.0 0.020542 0.008266 0.007882 0.015160 0.018730 0.023480 0.07895
fractal_dimension_se 569.0 0.003795 0.002646 0.000895 0.002248 0.003187 0.004558 0.02984
radius_worst 569.0 16.269190 4.833242 7.930000 13.010000 14.970000 18.790000 36.04000
texture_worst 569.0 25.677223 6.146258 12.020000 21.080000 25.410000 29.720000 49.54000
perimeter_worst 569.0 107.261213 33.602542 50.410000 84.110000 97.660000 125.400000 251.20000
area_worst 569.0 880.583128 569.356993 185.200000 515.300000 686.500000 1084.000000 4254.00000
smoothness_worst 569.0 0.132369 0.022832 0.071170 0.116600 0.131300 0.146000 0.22260
compactness_worst 569.0 0.254265 0.157336 0.027290 0.147200 0.211900 0.339100 1.05800
concavity_worst 569.0 0.272188 0.208624 0.000000 0.114500 0.226700 0.382900 1.25200
concave points_worst 569.0 0.114606 0.065732 0.000000 0.064930 0.099930 0.161400 0.29100
symmetry_worst 569.0 0.290076 0.061867 0.156500 0.250400 0.282200 0.317900 0.66380
fractal_dimension_worst 569.0 0.083946 0.018061 0.055040 0.071460 0.080040 0.092080 0.20750
In [254]:
X = df.drop('diagnosis', axis=1) # Drop the 'diagnosis' column (target)
y = df['diagnosis']
In [255]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [256]:
model = DecisionTreeClassifier(criterion='entropy') #criteria = gini, entropy
model.fit(X_train, y_train)
model
Out[256]:
DecisionTreeClassifier(criterion='entropy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy')
In [263]:
import math 

def entropy(column):
    counts = column.value_counts()
    prob = counts / len(column)
    return -sum(prob * prob.apply(math.log2))

def condition_entropy(data, X, target):
    feature_values = data[X].unique()
    we = 0
    for value in feature_values:
        subset = data[data[X] == value]
        we += (len(subset) / len(data)) * entropy(subset[target])
    return we
def information_gained(data, X, target):
    total_e = entropy(data[target])
    we = condition_entropy(data, X, target)
    return total_e - we

for feature in X:
    ig = information_gained(df, feature, "diagnosis")
    print(f"Information Gain for {feature}: {ig}")
Information Gain for radius_mean: 0.8607815854835991
Information Gain for texture_mean: 0.8357118798482908
Information Gain for perimeter_mean: 0.9267038614138748
Information Gain for area_mean: 0.9280305529818247
Information Gain for smoothness_mean: 0.7761788341876101
Information Gain for compactness_mean: 0.9091291689709926
Information Gain for concavity_mean: 0.9350604299589776
Information Gain for concave points_mean: 0.9420903069361305
Information Gain for symmetry_mean: 0.735036638169654
Information Gain for fractal_dimension_mean: 0.8361770160635639
Information Gain for radius_se: 0.9337337383910278
Information Gain for texture_se: 0.8642965239721755
Information Gain for perimeter_se: 0.9315454914704012
Information Gain for area_se: 0.925377169845925
Information Gain for smoothness_se: 0.9350604299589776
Information Gain for compactness_se: 0.9231889229252984
Information Gain for concavity_se: 0.9280305529818247
Information Gain for concave points_se: 0.8585933385629725
Information Gain for symmetry_se: 0.8181371874054084
Information Gain for fractal_dimension_se: 0.9174857375160954
Information Gain for radius_worst: 0.9003074642106167
Information Gain for texture_worst: 0.8634349686194988
Information Gain for perimeter_worst: 0.8985843535052632
Information Gain for area_worst: 0.9350604299589776
Information Gain for smoothness_worst: 0.7197189097252679
Information Gain for compactness_worst: 0.9183472928687721
Information Gain for concavity_worst: 0.9302187999024514
Information Gain for concave points_worst: 0.9148323543801957
Information Gain for symmetry_worst: 0.8453951399613433
Information Gain for fractal_dimension_worst: 0.8915544765281104
In [272]:
plt.figure(figsize=(22, 16))
plot_tree(model, filled=True, feature_names=X.columns, class_names=['Benign', 'Malignant'])
plt.show()
No description has been provided for this image
In [265]:
y_pred = model.predict(X_test)
y_pred
Out[265]:
array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1], dtype=int64)
In [266]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred) * 100
classification_rep = classification_report(y_test, y_pred)
# Print the results
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
Accuracy: 93.85964912280701
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95        71
           1       0.95      0.88      0.92        43

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114

In [267]:
df.head(1)
Out[267]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 1 17.99 10.38 122.8 1001.0 0.1184 0.2776 0.3001 0.1471 0.2419 0.07871 1.095 0.9053 8.589 153.4 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.6 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.1189
In [268]:
new = [[12.5, 19.2, 80.0, 500.0, 0.085, 0.1, 0.05, 0.02, 0.17, 0.06,
    0.4, 1.0, 2.5, 40.0, 0.006, 0.02, 0.03, 0.01, 0.02, 0.003,
    16.0, 25.0, 105.0, 900.0, 0.13, 0.25, 0.28, 0.12, 0.29, 0.08]]
y_pred = model.predict(new)
# Output the prediction (0 = Benign, 1 = Malignant)
if y_pred[0] == 0:
    print("Prediction: Benign")
else:
    print("Prediction: Malignant")
Prediction: Benign

Prg 9 - Develop a program to implement the Naive Bayesian classifier considering Olivetti Face Data set for training. Compute the accuracy of the classifier, considering a few test data sets.¶

In [273]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [274]:
from sklearn.datasets import fetch_olivetti_faces
data = fetch_olivetti_faces()
In [275]:
data.keys()
Out[275]:
dict_keys(['data', 'images', 'target', 'DESCR'])
In [276]:
print("Data Shape:", data.data.shape)
print("Target Shape:", data.target.shape)
print("There are {} unique persons in the dataset".format(len(np.unique(data.target))))
print("Size of each image is {}x{}".format(data.images.shape[1],data.images.shape[1]))
Data Shape: (400, 4096)
Target Shape: (400,)
There are 40 unique persons in the dataset
Size of each image is 64x64
In [277]:
def print_faces(images, target, top_n):
    # Ensure the number of images does not exceed available data
    top_n = min(top_n, len(images))
    # Set up figure size based on the number of images
    grid_size = int(np.ceil(np.sqrt(top_n)))
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(15, 15))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.2, wspace=0.2)
    for i, ax in enumerate(axes.ravel()):
        if i < top_n:
            ax.imshow(images[i], cmap='bone')
            ax.axis('off')
            ax.text(2, 12, str(target[i]), fontsize=9, color='red')
            ax.text(2, 55, f"face: {i}", fontsize=9, color='blue')
        else:
            ax.axis('off')
    plt.show()
In [278]:
print_faces(data.images, data.target, 400) 
No description has been provided for this image
In [284]:
def display_unique_faces(pics):
    fig = plt.figure(figsize=(24, 10)) # Set figure size
    columns, rows = 10, 4 # Define grid dimensions
    # Loop through grid positions and plot each image
    for i in range(1, columns * rows + 1):
        img_index = 10 * i - 1 # Calculate the image index
        if img_index < pics.shape[0]: # Check for valid image index
            img = pics[img_index, :, :]
            ax = fig.add_subplot(rows, columns, i)
            ax.imshow(img, cmap='gray')
            ax.set_title(f"Person {i}", fontsize=14)
            ax.axis('off')
    plt.suptitle("There are 40 distinct persons in the dataset", fontsize=24)
    plt.show()
In [285]:
display_unique_faces(data.images)
No description has been provided for this image
In [286]:
from sklearn.model_selection import train_test_split
X = data.data
Y = data.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
print("x_train: ",x_train.shape)
print("x_test: ",x_test.shape)
x_train:  (280, 4096)
x_test:  (120, 4096)
In [287]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
# Train the model
nb = GaussianNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
nb_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# Display accuracy result
print(f"Naive Bayes Accuracy: {nb_accuracy}%")
Confusion Matrix:
[[3 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 4]]
Naive Bayes Accuracy: 74.17%
In [288]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Initialize and fit Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Multinomial Naive Bayes Accuracy: {accuracy}%")
Multinomial Naive Bayes Accuracy: 75.83%
In [294]:
# Calculate the number of misclassified images
misclassified_idx = np.where(y_pred != y_test)[0]
num_misclassified = len(misclassified_idx)
# Print the number of misclassified images and accuracy
print(f"Number of misclassified images: {num_misclassified}")
print(f"Total images in test set: {len(y_test)}")
print(f"Accuracy: {round((1 - num_misclassified / len(y_test)) * 100, 2)}%")
# Visualize some of the misclassified images
n_misclassified_to_show = min(num_misclassified, 5) # Show up to 5 misclassified i
plt.figure(figsize=(10, 5))
for i in range(n_misclassified_to_show):
    idx = misclassified_idx[i]
    plt.subplot(1, n_misclassified_to_show, i + 1)
    plt.imshow(x_test[idx].reshape(64, 64), cmap='gray')
    plt.title(f"True: {y_test[idx]}, Pred: {y_pred[idx]}")
    plt.axis('off')
plt.show()
Number of misclassified images: 29
Total images in test set: 120
Accuracy: 75.83%
No description has been provided for this image
In [295]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
# Binarize the test labels
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
# Get predicted probabilities for each class
y_pred_prob = nb.predict_proba(x_test)
# Calculate and print AUC for each class
for i in range(y_test_bin.shape[1]):
    roc_auc = roc_auc_score(y_test_bin[:, i], y_pred_prob[:, i])
    print(f"Class {i} AUC: {roc_auc:.2f}")
Class 0 AUC: 0.69
Class 1 AUC: 1.00
Class 2 AUC: 0.99
Class 3 AUC: 0.98
Class 4 AUC: 0.97
Class 5 AUC: 1.00
Class 6 AUC: 1.00
Class 7 AUC: 0.89
Class 8 AUC: 1.00
Class 9 AUC: 1.00
Class 10 AUC: 1.00
Class 11 AUC: 1.00
Class 12 AUC: 0.99
Class 13 AUC: 1.00
Class 14 AUC: 1.00
Class 15 AUC: 1.00
Class 16 AUC: 0.50
Class 17 AUC: 0.18
Class 18 AUC: 0.42
Class 19 AUC: 0.87
Class 20 AUC: 0.31
Class 21 AUC: 0.68
Class 22 AUC: 0.20
Class 23 AUC: 0.44
Class 24 AUC: 0.25
Class 25 AUC: 0.55
Class 26 AUC: 0.76
Class 27 AUC: 0.75
Class 28 AUC: 0.51
Class 29 AUC: 0.70
Class 30 AUC: 0.50
Class 31 AUC: 0.19
Class 32 AUC: 0.41
Class 33 AUC: 0.41
Class 34 AUC: 0.22
Class 35 AUC: 0.04
Class 36 AUC: 0.87
Class 37 AUC: 0.82
Class 38 AUC: 0.51

Prg 10 - Develop a program to implement k-means clustering using Wisconsin Breast Cancer data set and visualize the clustering result.¶

In [296]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings('ignore')
In [297]:
data = pd.read_csv(r"./WisconsinBreastCancerdataset.csv")
In [298]:
data.head()
Out[298]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN
In [299]:
data.shape
Out[299]:
(569, 33)
In [300]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [301]:
data.diagnosis.unique()
Out[301]:
array(['M', 'B'], dtype=object)
In [302]:
data.isnull().sum()
Out[302]:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: int64
In [303]:
data.duplicated().sum()
Out[303]:
0
In [304]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)
In [305]:
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0}) # Malignant:1, Benign:0
In [306]:
df.describe().T
Out[306]:
count mean std min 25% 50% 75% max
diagnosis 569.0 0.372583 0.483918 0.000000 0.000000 0.000000 1.000000 1.00000
radius_mean 569.0 14.127292 3.524049 6.981000 11.700000 13.370000 15.780000 28.11000
texture_mean 569.0 19.289649 4.301036 9.710000 16.170000 18.840000 21.800000 39.28000
perimeter_mean 569.0 91.969033 24.298981 43.790000 75.170000 86.240000 104.100000 188.50000
area_mean 569.0 654.889104 351.914129 143.500000 420.300000 551.100000 782.700000 2501.00000
smoothness_mean 569.0 0.096360 0.014064 0.052630 0.086370 0.095870 0.105300 0.16340
compactness_mean 569.0 0.104341 0.052813 0.019380 0.064920 0.092630 0.130400 0.34540
concavity_mean 569.0 0.088799 0.079720 0.000000 0.029560 0.061540 0.130700 0.42680
concave points_mean 569.0 0.048919 0.038803 0.000000 0.020310 0.033500 0.074000 0.20120
symmetry_mean 569.0 0.181162 0.027414 0.106000 0.161900 0.179200 0.195700 0.30400
fractal_dimension_mean 569.0 0.062798 0.007060 0.049960 0.057700 0.061540 0.066120 0.09744
radius_se 569.0 0.405172 0.277313 0.111500 0.232400 0.324200 0.478900 2.87300
texture_se 569.0 1.216853 0.551648 0.360200 0.833900 1.108000 1.474000 4.88500
perimeter_se 569.0 2.866059 2.021855 0.757000 1.606000 2.287000 3.357000 21.98000
area_se 569.0 40.337079 45.491006 6.802000 17.850000 24.530000 45.190000 542.20000
smoothness_se 569.0 0.007041 0.003003 0.001713 0.005169 0.006380 0.008146 0.03113
compactness_se 569.0 0.025478 0.017908 0.002252 0.013080 0.020450 0.032450 0.13540
concavity_se 569.0 0.031894 0.030186 0.000000 0.015090 0.025890 0.042050 0.39600
concave points_se 569.0 0.011796 0.006170 0.000000 0.007638 0.010930 0.014710 0.05279
symmetry_se 569.0 0.020542 0.008266 0.007882 0.015160 0.018730 0.023480 0.07895
fractal_dimension_se 569.0 0.003795 0.002646 0.000895 0.002248 0.003187 0.004558 0.02984
radius_worst 569.0 16.269190 4.833242 7.930000 13.010000 14.970000 18.790000 36.04000
texture_worst 569.0 25.677223 6.146258 12.020000 21.080000 25.410000 29.720000 49.54000
perimeter_worst 569.0 107.261213 33.602542 50.410000 84.110000 97.660000 125.400000 251.20000
area_worst 569.0 880.583128 569.356993 185.200000 515.300000 686.500000 1084.000000 4254.00000
smoothness_worst 569.0 0.132369 0.022832 0.071170 0.116600 0.131300 0.146000 0.22260
compactness_worst 569.0 0.254265 0.157336 0.027290 0.147200 0.211900 0.339100 1.05800
concavity_worst 569.0 0.272188 0.208624 0.000000 0.114500 0.226700 0.382900 1.25200
concave points_worst 569.0 0.114606 0.065732 0.000000 0.064930 0.099930 0.161400 0.29100
symmetry_worst 569.0 0.290076 0.061867 0.156500 0.250400 0.282200 0.317900 0.66380
fractal_dimension_worst 569.0 0.083946 0.018061 0.055040 0.071460 0.080040 0.092080 0.20750
In [307]:
#dropped the Diagnosis (target) since clustering is unsupervised.
df.drop(columns=["diagnosis"], inplace=True) # Removing Target
In [308]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
In [309]:
# Apply PCA for Dimensionality Reduction
pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)
In [310]:
# Check explained variance ratio
explained_variance = pca.explained_variance_ratio_
total_explained_variance = np.sum(explained_variance)
print(f"Variance explained by PC1: {explained_variance[0]:.4f}")
print(f"Variance explained by PC2: {explained_variance[1]:.4f}")
print(f"Total variance explained by first 2 components: {total_explained_variance:.4f}")
Variance explained by PC1: 0.4427
Variance explained by PC2: 0.1897
Total variance explained by first 2 components: 0.6324
In [313]:
wcss = [] # Within-Cluster Sum of Squares
K_range = range(1,11)
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    wcss.append(kmeans.inertia_)
In [314]:
# Plot the Elbow Method Graph
plt.figure(figsize=(8, 5))
plt.plot(K_range, wcss, marker="o", linestyle="-")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS")
plt.title("Elbow Method to Find Optimal k")
plt.show()
No description has been provided for this image
In [316]:
#Apply K-Means Clustering with the optimal k (usually where elbow occurs, k=2)
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)
In [324]:
# Step 7: Visualize the Clusters
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clustering after PCA")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]: